Lending club risk adjusted interest rate prediction

QUESTION phase:

define problem

MODELING phase:

clean data
explore data
build model

VALIDATION phase:

validate model

CONCLUSION phase:

analysis result

read data

In [1]:
loan <- read.csv("loan.csv", stringsAsFactors = FALSE)
head(loan, 3)
idmember_idloan_amntfunded_amntfunded_amnt_invtermint_rateinstallmentgradesub_gradetotal_bal_ilil_utilopen_rv_12mopen_rv_24mmax_bal_bcall_utiltotal_rev_hi_liminq_fitotal_cu_tlinq_last_12m
1077501 1296599 5000 5000 4975 36 months10.65 162.87 B B2 NA NA NA NA NA NA NA NA NA NA
1077430 1314167 2500 2500 2500 60 months15.27 59.83 C C4 NA NA NA NA NA NA NA NA NA NA
1077175 1313524 2400 2400 2400 36 months15.96 84.33 C C5 NA NA NA NA NA NA NA NA NA NA

QUESTION phase

rough ideas :
    predict the grade or sub_grade assigned by LC(classification problem)
    predict the loan_status (classification problem)
    predict int_rate(interest rate loan) (regression problem)
these problem are interested by borrowers

MODEL phase

rough ideas:
clean data
explore data
    variable identification
    missing value treatment
    variable analysis
    feature engineering
build a regression model

clean data

since this data is already table like, we can skip this step

explore data

variable identification
    type of variable: predictors(features), response(target)
    data type: character, numeric, object
    variable category: continuous, categorical

type of variable

In [2]:
# int_rate is the response
response = loan$int_rate
# all others are features, where `which` find the index of column "int_rate"
features = loan[, -which(names(loan) == "int_rate")]

data type

numerical
    continous : e.g. int_rate
    discrete : e.g. inq_fi(Number of personal finance inquiries)
categorical
    ordinal : e.g. grade
    nominal : e.g. zip code
ATTENTION: data type may not like the type it loaded into
In [3]:
# load library
library(ggplot2)

# apply function class to all load columns
types = sapply(loan, class)

# convert types to data.frame
# `table` calculate the frequency
df.types = data.frame(table(types), row.names = NULL)

# visualization
options(repr.plot.width=4, repr.plot.height=2)
ggplot(data = df.types,
       aes(x = types, y = Freq/sum(Freq))) + 
geom_bar(stat = "identity", width = 0.2)
In [4]:
# Check variable one by one

# addr_state(categorical: nomial)
# emp_length(numeric: discrete ) change from string to integer
substr(loan$emp_length[1], 1, 2) # not numerical
    # deinfe a function extract numbers from string
str2num <- function(str){as.numeric(gsub("([^0-9])", "", str))}
loan$emp_length = sapply(loan$emp_length, str2num)
# emp_title(categorical: nominal)
# home_ownership (categorical: nominal)
# member_id(categorical: nominal)
# zip_code(categorical: nominal)
# annual_inc(numerical: continuous)
# annual_inc_joint(numerical: continuous)
# is_inc_v(binary) : missing feature
# verified_status_joint(categorical: nominal)
    # unique(loan$verification_status_joint)
# dti(numerical: continous)
# dti_joint(numerical: continous)
# earliest_cr_line(categorica:nominal) could be splitted to year and mon
library(zoo)
loan$earliest_cr_line_date = as.Date(as.yearmon(loan$earliest_cr_line, "%b-%Y"))
loan$earliest_cr_line_year = format(loan$earliest_cr_line_date, "%Y")
loan$earliest_cr_line_mon = format(loan$earliest_cr_line_date, "%m")
# fico_range_high: missing
# fico_range_low: missing
# last_fico_range_high: missing
# last_fico_range_low: missing
# inq_fi(numerical : discrete)
# inq_last_12m(numerical : discrete)
# inq_last_6mths(numerical : discrete)
# last_credit_pull_d(categorical) could be splitted to year and mon
loan$last_credit_pull_d_date = as.Date(as.yearmon(loan$last_credit_pull_d, "%b-%Y"))
loan$last_credit_pull_d_year = format(loan$last_credit_pull_d_date, "%Y")
loan$last_credit_pull_d_mon = format(loan$last_credit_pull_d_date, "%m")
# total_acc(numerical : discrete)
# tot_cur_bal(numerical : continous)   
# all_util(numerical : continous)
# open_acc(numerical : discrete)
# total_acc(numerical : discrete)
# tot_cur_bal(numerical : continous)
# all_util(numerical : continous)
# open_acc(numerical : discrete)
# open_acc_6m(numerical : discrete)
# total_cu_tl(numerical : discrete)
# acc_noew_delinq(numerical : discrete)
# delinq_2yrs(numerical : discrete)
# mths_since_last_delinq(numerical : discrete)
# collections_12_mths_ex_med(numerical : discrete)
# tot_coll_amt(numerical : discrete)
# pub_rec(numerical : discrete)
# mths_since_last_major_derog(numerical : discrete)
# mths_since_last_record(numerical : discrete)
# il_util(numerical : continuous)
# mths_since_rcnt_il(numerical : discrete)
# open_il_12m(numerical : discrete)
# open_il_24m(numerical : discrete)
# open_il_6m(numerical : discrete)
# total_bal_il(numerical : continuous)
# max_bal_bc(numerical : continuous)
# open_rv_12m(numerical : discrete)
# open_rv_24m(numerical : discrete)
# revol_bal(numerical : continuous)
# revol_util(numerical : continuous)
# total_rev_hi_lim(numerical : continuous)
# policy_code(binary) : no use, unique value
# url(string): not useful
# application_type(categorical : nominal)
# desc(string) might be useful, but need nlp
# id : not useful
# purpose(categorical : nominal)
# term(categorical : nominal)
# title(string) might be useful, but need nlp
# issue_d(categorical) could be splitted to year and mon
loan$issue_d_date = as.Date(as.yearmon(loan$issue_d, "%b-%Y"))
loan$issue_d_year = format(loan$issue_d_date, "%Y")
loan$issue_d_mon = format(loan$issue_d_date, "%m")
# inital_list_status(binary)
# loan_amnt(numerical : continuous)
# installment(numerical : continuous)
# funded_amnt(numerical : continuous)
# funded_amnt_inv(numerical : continuous)
# last_pymnt_amnt(numerical : continuous)
# last_pymnt_d(categorical)
loan$last_pymnt_d_date = as.Date(as.yearmon(loan$last_pymnt_d, "%b-%Y"))
loan$last_pymnt_d_year = format(loan$last_pymnt_d_date, "%Y")
loan$last_pymnt_d_mon = format(loan$last_pymnt_d_date, "%m")
# next_pymnt_d(categorical)
loan$next_pymnt_d_date = as.Date(as.yearmon(loan$next_pymnt_d, "%b-%Y"))
loan$next_pymnt_d_year = format(loan$next_pymnt_d_date, "%Y")
loan$next_pymnt_d_mon = format(loan$next_pymnt_d_date, "%m")
# pymnt_plan(binary)
# recoveries(numerical : continuous)
# total_pymnt(numerical : continuous)
# total_pymnt_inv(numerical : continuous)
# total_rec_int(numerical : continuous)
# total_rec_late_fee(numerical : continuous)
# total_rec_prncp(numerical : continuous)
# collection_recovery_fee(numerical : continuous)
# out_prncp(numerical : continuous)
# out_prncp_inv(numerical : continuous)

# Potential response variables -------------------------------

# grade(categorical : ordinal)
#   plot(table(loan$grade))

# sub_grade(categorical : ordinal)
#   plot(table(loan$sub_grade))

# int_rate(numerical : continuous)
#   hist(loan$int_rate)

# loan_status(categorical : nominal)
#   unique(loan$loan_status)
'10'
Attaching package: ‘zoo’

The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric

variable analysis

univariate analysis
▪   continuous
▪   categorical
bi-variate analysis
▪   continuous & continuous
▪   categorical & categorical
▪   continuous & categorical

common analysis tool:

•   common statistics : mean, median, skewness, ...
•   data visualization : barplot, histogram, density plot, ...
•   hpyothesis test : Chi2-test, anova, t-test, ...

Since we focus on int_rate, which is response, basicly, we can think, what we are doing is try to estimate the density of P(response|features)P(response|features) int_rate : Interest Rate on the loan since this is continuous variable, we can look at its basic stats, and histogram, density plot, boxplot

In [5]:
# stats
"range and quantile"
summary(loan$int_rate)

"sd"
sd(loan$int_rate)
'range and quantile'
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   5.32    9.99   12.99   13.25   16.20   28.99 
'sd'
4.3818674147124

analysis: positive skew, but not strong, which means we could could use nomarl distribution to model this curve, but we need more analysis

In [6]:
plothistdensity <- function(data, nbins){
    options(repr.plot.width=8, repr.plot.height=2)
    p <- ggplot(data, aes(x=int_rate, y = ..density..))
    p <- p + geom_histogram(aes(y=..count../sum(..count..)),
                           bins = nbins,
                           color = "grey",
                           fill="orange")
    p <- p + geom_density()
    p + theme_dark() + 
    ylab("Normalized_Count") + 
    ggtitle(paste(paste("int_rate : ", nbins), " bins histogram with density" ))  
}

plothistdensity(loan, 10)
plothistdensity(loan, 20)
plothistdensity(loan, 50)
plothistdensity(loan, 100)

analysis: usually histogram could roughly estimate pdf of a random variable, however, to get more stable and generalize result we could use KDE to estimate density of a random variable. From the density plot, we observe many peaks, which indicates this a multimodal distribution. This means there are different patterns inside the distribution, usually, it is a mixture distribution. This pattern may cause by groups(usage, years, repuations, states, ...)

Obvious groups in the features are categorical variables

Group analysis on int_rate

1. int_rate with grade

int_rate : Interest Rate on the loan

grade : LC assigned loan grade
In [7]:
options(repr.plot.width=4, repr.plot.height=2)
ggplot(data.frame(table(loan$grade)), aes(x=Var1, y=Freq)) + geom_bar(stat = "identity")
# group histogram
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=int_rate, fill=grade)) + 
geom_histogram(bins = 100, color = "white") + theme_light() + 
geom_freqpoly(bins = 30, color = "grey")
In [8]:
# group density
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=int_rate, y= ..density.., fill=grade)) + 
theme_light() + 
geom_density(bw = 0.3, alpha= 0.6)

Even we group int_rate by grade, it still shows multimodality inside each group, we can dig into it latter, but, clearly it shows grade was directly related to int_rate

In [9]:
# boxplot 1
options(repr.plot.width=4, repr.plot.height=4)
ggplot(loan, aes(x=factor(grade), y=int_rate)) + 
geom_boxplot() + labs(x='grade') + 
stat_summary(fun.y="mean", geom="point", shape=23, size=3, fill="white")

box plot shows the skewness, quantile, outliers of each group, but it cannot show the amount of data in each group

In [10]:
table(loan$grade)
     A      B      C      D      E      F      G 
148202 254535 245860 139542  70705  23046   5489 
In [11]:
options(repr.plot.width=4, repr.plot.height=4)
ggplot(loan, aes(x=factor(grade), y=int_rate)) + 
geom_jitter(alpha=0.3, color='lightgrey') +
geom_boxplot(color='steelblue') + 
labs(x='grade')

we can also use a violin plot to compare multiple distribution in different group

In [12]:
options(repr.plot.width=4, repr.plot.height=4)
ggplot(loan, aes(x=factor(grade), y=int_rate)) +
geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) #+ geom_boxplot(width=.1, fill="black", outlier.colour=NA) +
#stat_summary(fun.y=median, geom="point", fill="white", shape=21, size=2.5)

2. int_rate with subgrade

 int_rate : Interest Rate on the loan

 subgrade : LC assigned loan subgrade
In [13]:
options(repr.plot.width=8, repr.plot.height=8)
ggplot(loan, aes(x=int_rate, y=..density.., fill = sub_grade)) +
geom_density(bw = 0.1, alpha= 0.6) + facet_grid(grade ~ .)

3. int_rate with loan_status

int_rate : Interest Rate on the loan
loan_status : Current status of the loan
In [14]:
options(repr.plot.width=8, repr.plot.height=8)
ggplot(loan, aes(x=factor(loan_status), y=int_rate)) + 
geom_jitter(alpha=0.3, color='lightgrey') +
geom_boxplot(color='steelblue') + 
labs(x='loan_status') + 
theme(axis.text.x = element_text(angle=45, hjust=1, vjust=0.8),legend.position = "none")
In [15]:
options(repr.plot.width=8, repr.plot.height=6)
ggplot(loan, aes(x=int_rate, y= ..density.., fill=loan_status)) + 
theme_light() + 
facet_grid(loan_status ~ .) +
geom_density(alpha= 0.6)

int_rate also influence by loan_status, and within each loan_status group, it appears more like a normal

USER FEATURES vs int_rate

4. addr_state with int_rate

In [16]:
library(choroplethr)
library(choroplethrMaps)

state.ave.int_rate = as.list(by(loan$int_rate, loan$addr_state, mean))
state.names = names(state.ave.int_rate)
names(state.ave.int_rate) = NULL
state.int_rate.df = data.frame(region = state.names, value = unlist(state.ave.int_rate))

full.name = c()
for(i in 1:51)
{
    if(i==8){
        full.name <- append(full.name, "district of columbia")
    }else{
        full.name <- append(full.name, tolower(state.name[state.int_rate.df[i,1] == state.abb]))
    }
}

state.int_rate.df$region = full.name

options(repr.plot.width=16, repr.plot.height=8)
state_choropleth(state.int_rate.df)
Loading required package: acs
Loading required package: stringr
Loading required package: XML

Attaching package: ‘acs’

The following object is masked from ‘package:base’:

    apply

In [17]:
# anova analysis on different groups
aov.res = aov(int_rate ~ factor(addr_state), na.action = na.omit, data = loan)
summary(aov.res)
                       Df   Sum Sq Mean Sq F value Pr(>F)    
factor(addr_state)     50    26795   535.9   27.95 <2e-16 ***
Residuals          887328 17011539    19.2                   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
As the p-value is less than the significance level 0.001, we can conclude that there are significant differences between the groups. 

possible feature enigneering:
    group median of int_rate on each state
    set level of int_rate on each state

5. emp_length with int_rate

emp_length with int_rate : Employment length in years. Possible values are between 0 and 10 where 0 means less than one year and 10 means ten or more years.
In [18]:
unique(loan$emp_length)
  1. 10
  2. 1
  3. 3
  4. 8
  5. 9
  6. 4
  7. 5
  8. 6
  9. 2
  10. 7
  11. <NA>
In [19]:
loan[which(loan$emp_length == "n/a"), "emp_length"] <- NA
sum(is.na(loan$emp_length))/dim(loan)[1]
0.0505139292230265
In [20]:
# group density
options(repr.plot.width=4, repr.plot.height=4)
ggplot(loan, aes(x=int_rate, y= ..density.., fill=emp_length)) + 
theme_light() + 
facet_grid(emp_length ~ .) +
geom_density(alpha= 0.6)
In [21]:
options(repr.plot.width=4, repr.plot.height=4)
ggplot(loan, aes(x=factor(emp_length), y=int_rate)) + 
geom_boxplot() + labs(x='emp_length') + 
stat_summary(fun.y="mean", geom="point", shape=23, size=3, fill="white")
In [22]:
# anova analysis on different groups
aov.res = aov(int_rate ~ factor(emp_length), na.action = na.omit, data = loan, )
summary(aov.res)
                       Df   Sum Sq Mean Sq F value Pr(>F)    
factor(emp_length)      9     4667   518.6   26.97 <2e-16 ***
Residuals          842544 16202138    19.2                   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
44825 observations deleted due to missingness
In [23]:
# Tukey test to find significance between groups
tukey.res = TukeyHSD(aov.res, conf.level = 0.95)                   
tukey.res
  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = int_rate ~ factor(emp_length), data = loan, na.action = na.omit)

$`factor(emp_length)`
              diff          lwr         upr     p adj
2-1   0.0275563392 -0.035273391  0.09038607 0.9313745
3-1   0.0105064267 -0.054729933  0.07574279 0.9999664
4-1   0.0533839088 -0.018527908  0.12529573 0.3574399
5-1   0.1029239235  0.032479163  0.17336868 0.0001638
6-1   0.2421105663  0.164725163  0.31949597 0.0000000
7-1   0.2887307876  0.212420278  0.36504130 0.0000000
8-1   0.1030346301  0.026314103  0.17975516 0.0008972
9-1   0.1289560620  0.044927387  0.21298474 0.0000526
10-1  0.0956418964  0.049087233  0.14219656 0.0000000
3-2  -0.0170499125 -0.089084135  0.05498431 0.9991740
4-2   0.0258275696 -0.052303448  0.10395859 0.9893992
5-2   0.0753675843 -0.001415297  0.15215047 0.0594696
6-2   0.2145542271  0.131357789  0.29775066 0.0000000
7-2   0.2611744484  0.178976879  0.34337202 0.0000000
8-2   0.0754782909 -0.007100071  0.15805665 0.1077565
9-2   0.1013997228  0.011990748  0.19080870 0.0123416
10-2  0.0680855571  0.012403574  0.12376754 0.0043200
4-3   0.0428774821 -0.037201626  0.12295659 0.7992928
5-3   0.0924174968  0.013653174  0.17118182 0.0078476
6-3   0.2316041395  0.146575586  0.31663269 0.0000000
7-3   0.2782243608  0.194172900  0.36227582 0.0000000
8-3   0.0925282034  0.008104312  0.17695210 0.0188108
9-3   0.1184496352  0.027333377  0.20956589 0.0016043
10-3  0.0851354696  0.026751474  0.14351947 0.0001710
5-4   0.0495400147 -0.034836198  0.13391623 0.6979490
6-4   0.1887266575  0.098474889  0.27897843 0.0000000
7-4   0.2353468788  0.146015055  0.32467870 0.0000000
8-4   0.0496507213 -0.040031608  0.13933305 0.7656344
9-4   0.0755721532 -0.020436696  0.17158100 0.2735067
10-4  0.0422579876 -0.023500796  0.10801677 0.5754042
6-5   0.1391866428  0.050099401  0.22827388 0.0000338
7-5   0.1858068640  0.097651718  0.27396201 0.0000000
8-5   0.0001107066 -0.088399605  0.08862102 1.0000000
9-5   0.0260321384 -0.068882845  0.12094712 0.9973639
10-5 -0.0072820272 -0.071433197  0.05686914 0.9999984
7-6   0.0466202213 -0.047174054  0.14041450 0.8616264
8-6  -0.1390759362 -0.233204101 -0.04494777 0.0001277
9-6  -0.1131545043 -0.213328805 -0.01298020 0.0129887
10-6 -0.1464686699 -0.218172289 -0.07476505 0.0000000
8-7  -0.1856961575 -0.278942630 -0.09244968 0.0000000
9-7  -0.1597747256 -0.259121007 -0.06042844 0.0000160
10-7 -0.1930888912 -0.263631092 -0.12254669 0.0000000
9-8   0.0259214319 -0.073740141  0.12558300 0.9982548
10-8 -0.0073927338 -0.078378278  0.06359281 0.9999992
10-9 -0.0333141656 -0.112141360  0.04551303 0.9451652
In [24]:
# feature engineering : split emp_length to < 5 ; >= 5
loan$emp_length_sim = NA
loan[which(is.na(loan[,'emp_length'])),'emp_length']=median(loan[,'emp_length'],na.rm=T)
loan[which(loan$emp_length %in% c('10', '8','9','5','6','7')), "emp_length_sim"] = ">= 5"
loan[which(loan$emp_length %in% c('0', '1','2','3','4')), "emp_length_sim"] = "< 5"
In [25]:
table(loan$emp_length_sim)
   < 5   >= 5 
329125 558254 
In [26]:
options(repr.plot.width=4, repr.plot.height=4)
ggplot(loan, aes(x=factor(emp_length_sim), y=int_rate)) + 
geom_boxplot() + labs(x='emp_length_sim') + 
stat_summary(fun.y="mean", geom="point", shape=23, size=3, fill="white")
In [27]:
# t-test on group means
t.test(int_rate ~ emp_length_sim, data = loan, na.action = na.omit) # significant
	Welch Two Sample t-test

data:  int_rate by emp_length_sim
t = -11.472, df = 698310, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.1288463 -0.0912435
sample estimates:
 mean in group < 5 mean in group >= 5 
          13.17751           13.28755 

6. emp_title with int_rate

nominal
In [28]:
# display top ten emp_title
loan$emp_title = tolower(loan$emp_title)            # convert all string to lowercase
# convert " " to na
loan$emp_title[which(loan$emp_title == "")] = NA
sort(table(loan$emp_title), decreasing = T)[1:100]   # possible text processing, na exists as a level
length(unique(loan$emp_title))
# percentage of missing 
sum(is.na(loan$emp_title))/dim(loan)[1]
                 teacher                  manager                    owner 
                   16619                    14963                     8586 
        registered nurse               supervisor                   driver 
                    7721                     6937                     6215 
                   sales                       rn           office manager 
                    6208                     6017                     4687 
         project manager          general manager             truck driver 
                    4625                     4256                     3566 
                director                 engineer                president 
                    3427                     3186                     3094 
           sales manager           vice president           police officer 
                    2713                     2633                     2603 
      operations manager            store manager                 attorney 
                    2591                     2465                     2388 
              accountant administrative assistant                    nurse 
                    2353                     2307                     2160 
              technician          account manager                 mechanic 
                    2155                     2107                     1982 
       assistant manager      executive assistant                  analyst 
                    1830                     1742                     1736 
                  server           branch manager                paralegal 
                    1715                     1575                     1515 
       software engineer         customer service                  foreman 
                    1496                     1445                     1390 
              consultant              electrician               controller 
                    1386                     1383                     1380 
         program manager                 operator            administrator 
                    1338                     1292                     1240 
                   clerk        account executive                principal 
                    1237                     1196                     1156 
        machine operator         business analyst                      ceo 
                    1142                     1117                     1057 
               professor       executive director                machinist 
                    1050                     1040                     1035 
              it manager                      lpn              maintenance 
                    1034                     1015                      997 
               secretary   director of operations         district manager 
                     984                      957                      936 
              pharmacist                associate                  us army 
                     932                      930                      919 
                 laborer          legal assistant            social worker 
                     917                      915                      910 
         service manager           superintendent        financial advisor 
                     905                      900                      891 
         sales associate                  cashier                      cna 
                     886                      884                      883 
               physician     correctional officer                bartender 
                     877                      837                      824 
                 officer        financial analyst        medical assistant 
                     810                      805                      805 
      production manager                   welder               bookkeeper 
                     790                      788                      771 
             firefighter       accounting manager         security officer 
                     768                      763                      751 
               sales rep               instructor           letter carrier 
                     716                      713                      712 
      nurse practitioner         business manager    respiratory therapist 
                     711                      696                      695 
                    cook              team leader                     tech 
                     694                      690                      681 
              dispatcher         property manager           senior manager 
                     678                      663                      663 
            case manager                    agent          finance manager 
                     658                      653                      651 
         legal secretary             receptionist        warehouse manager 
                     647                      639                      639 
             underwriter 
                     635 
261562
0.0579876242282046
In [29]:
# bar plot 
emp_title.pop = sort(table(loan$emp_title)[which(table(loan$emp_title)>500)], decreasing = T)
emp_title.pop = as.data.frame(emp_title.pop)

options(repr.plot.width=30, repr.plot.height=10)
ggplot(data = emp_title.pop, aes(x=Var1, y=Freq)) + 
  geom_bar(stat="identity") + 
#    scale_y_log10() + 
    scale_x_discrete(name ="emp_titles", 
                     limits=emp_title.pop$Var1)+
  theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")

possible feature engineering : binning

In [30]:
# generate feature emp_title
#emp_title.ave_rate = as.list(by(loan$int_rate, loan$emp_title, median))
#emp_title.ave_rate = unlist(emp_title.ave_rate)
#loan$gen_emp_title_ave_rate = unlist(apply(loan, 1, function(x){return(emp_title.ave_rate[x["emp_title"]])}))
In [31]:
#plot(table(loan$gen_empt_title_ave_rate))

7. home_ownership with int_rate¶

nominal

In [32]:
unique(loan$home_ownership)
  1. 'RENT'
  2. 'OWN'
  3. 'MORTGAGE'
  4. 'OTHER'
  5. 'NONE'
  6. 'ANY'
In [33]:
sum(loan$home_ownership == "NONE") / dim(loan)[1]
5.63457102320429e-05
In [34]:
#boxplot 1
options(repr.plot.width=4, repr.plot.height=4)
ggplot(loan, aes(x=factor(home_ownership), y=int_rate)) + 
#geom_jitter(alpha=0.3, color='lightgrey') +
geom_boxplot() + labs(x='home_ownership') + 
stat_summary(fun.y="mean", geom="point", shape=23, size=3, fill="white")

we can observe rough trend, that mortage < own < rent

In [35]:
# binning "ANY", "NONE", "OTHER"
loan$gen_home_ownership = loan$home_ownership
loan$gen_home_ownership[loan$home_ownership %in% c("ANY", "NONE", "OTHER")] = "others"
table(loan$gen_home_ownership)
MORTGAGE   others      OWN     RENT 
  443557      235    87470   356117 

8. zip code with int_rate

In [36]:
sum(is.na(loan$zip_code))/dim(loan)[1]
0
In [37]:
head(table(loan$zip_code, loan$addr_state))
       
          AK   AL   AR   AZ   CA   CO   CT   DC   DE   FL   GA   HI   IA   ID
  007xx    0    0    0    0    0    0    0    0    0    1    0    0    0    0
  008xx    0    0    0    0    0    0    0    0    0    0    0    0    0    0
  010xx    0    0    0    0    0    0    1    0    0    0    0    0    0    0
  011xx    0    0    0    0    0    0    1    0    0    0    0    0    0    0
  012xx    0    0    0    0    0    0    0    0    0    0    0    0    0    0
  013xx    0    0    0    0    0    0    0    0    0    0    0    0    0    0
       
          IL   IN   KS   KY   LA   MA   MD   ME   MI   MN   MO   MS   MT   NC
  007xx    0    0    0    0    0    0    0    0    0    0    0    0    0    0
  008xx    0    0    0    0    0    0    0    0    0    0    0    0    0    0
  010xx    0    0    0    0    0 1476    0    0    0    0    0    0    0    0
  011xx    0    0    0    0    0  412    0    0    0    0    0    0    0    0
  012xx    0    0    0    0    0  346    0    0    0    0    0    0    0    0
  013xx    0    0    0    0    0  244    0    0    0    0    0    0    0    0
       
          ND   NE   NH   NJ   NM   NV   NY   OH   OK   OR   PA   RI   SC   SD
  007xx    0    0    0    2    0    0    0    0    0    0    1    0    0    0
  008xx    0    0    0    0    0    0    0    0    0    0    0    0    0    0
  010xx    0    0    0    0    0    0    0    0    0    0    0    0    0    0
  011xx    0    0    0    0    0    0    0    0    0    0    0    0    0    0
  012xx    0    0    0    0    0    0    0    0    0    0    0    0    0    0
  013xx    0    0    0    0    0    0    0    0    0    0    0    0    0    0
       
          TN   TX   UT   VA   VT   WA   WI   WV   WY
  007xx    0    0    0    0    0    0    0    0    0
  008xx    0    0    0    3    0    0    0    0    0
  010xx    0    0    0    0    0    0    0    0    0
  011xx    0    0    0    0    0    0    0    0    0
  012xx    0    0    0    0    0    0    0    0    0
  013xx    0    0    0    0    0    0    0    0    0

possible : feature engineering combine zip_code and addr_state to generate more accurate address

In [38]:
# possible : feature engineering combine zip_code and addr_state to generate more accurate address
loan$gen_add_state_zip = paste(loan$addr_state, as.character(loan$zip_code))
add_state_zip.ave_rate = as.list(by(loan$int_rate, loan$gen_add_state_zip, median))
add_state_zip.ave_rate = unlist(add_state_zip.ave_rate)
loan$gen_add_state_zip_ave_rate = unlist(apply(loan, 1, function(x){return(add_state_zip.ave_rate[x["gen_add_state_zip"]])}))
In [39]:
plot(table(loan$gen_add_state_zip_ave_rate))

USER FEATURES (financial specific) vs int_rate

9. annual_inc with int_rate

The self-reported annual income provided by the borrower during registration.
In [40]:
length(which(is.na(loan$annual_inc) | (loan$annual_inc == 0)))/dim(loan)[1]

loan$annual_inc[which(is.na(loan$annual_inc) | (loan$annual_inc == 0))] = mean(loan$annual_inc, na.rm=T)
#loan$annual_inc[which(is.na(loan$annual_inc) == 0)] = mean(loan$annual_inc, na.rm=T)
range(loan$annual_inc)

# linear relationship
options(repr.plot.width=8, repr.plot.height=8)
ggplot(loan, aes(x = annual_inc, y=int_rate)) + geom_hex() + stat_smooth(method=lm)
6.76148522784515e-06
  1. 1200
  2. 9500000
In [41]:
ggplot(loan, aes(x = annual_inc, y=int_rate)) + 
stat_density_2d(aes(fill = ..level..), geom = "polygon")
In [42]:
# linear relationship in log scale : possible feature engineering
options(repr.plot.width=8, repr.plot.height=8)
ggplot(loan, aes(x = log(annual_inc), y=int_rate)) + geom_hex() + stat_smooth(method=lm)
In [43]:
cor(loan$annual_inc, loan$int_rate)
# possible feature
cor(log(loan$annual_inc), loan$int_rate)
-0.0727845648392547
-0.110644975032076
In [44]:
# new feature
loan$gen_log_annual_inc = log(loan$annual_inc)

10. annual_inc_joint with int_rate

numerical: continuous
In [45]:
# drop this feature
sum(is.na(loan$annual_inc_joint))/dim(loan)[1]
0.999424146841429

11. verified_status_joint with int_rate

categorical: nominal
In [46]:
sum(is.na(loan$verification_status))/dim(loan)[1]
unique(loan$verification_status)
0
  1. 'Verified'
  2. 'Source Verified'
  3. 'Not Verified'
In [47]:
options(repr.plot.width=4, repr.plot.height=4)
ggplot(loan, aes(x=factor(verification_status), y=int_rate)) + 
geom_boxplot() + labs(x='verification_status') + 
stat_summary(fun.y="mean", geom="point", shape=23, size=3, fill="white")
In [48]:
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=int_rate, y= ..density.., fill=verification_status)) + 
theme_light() + 
geom_density(bw = 0.3, alpha= 0.6)
In [49]:
aov.res = aov(int_rate ~ factor(verification_status), na.action = na.omit, data = loan, )
summary(aov.res)
                                Df   Sum Sq Mean Sq F value Pr(>F)    
factor(verification_status)      2  1080177  540089   30032 <2e-16 ***
Residuals                   887376 15958157      18                   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

As the p-value is less than the significance level 0.001, we can conclude that there are significant differences between the different verification status.

12. dti with int_rate

dti : A ratio calculated using the borrower’s total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower’s self-reported monthly income.

In [50]:
quantile(loan$dti)
0%
0
25%
11.91
50%
17.65
75%
23.95
100%
9999
In [51]:
# weak linear relationship
cor(loan$dti, loan$int_rate)
0.0799025511904788

13. dti_joint with int_rate

numerical: continuous drop this feature

In [52]:
sum(is.na(loan$dti_joint))/dim(loan)[1]
0.999426400669838

14. inq_fi with int_rate

inq_fi : Number of personal finance inquiries

In [53]:
# drop this feature
sum(is.na(loan$inq_fi))/dim(loan)[1]
0.975915589618416

15. inq_last_12m with int_rate

inq_last_12m : Number of credit inquiries in past 12 months

In [54]:
# drop this feature
sum(is.na(loan$inq_last_12m))/dim(loan)[1]
0.975915589618416

16. inq_last_6mths with int_rate

inq_last_6mths : The number of inquiries in past 6 months (excluding auto and mortgage inquiries)

In [55]:
sum(is.na(loan$inq_last_6mths))/dim(loan)[1]
3.26805119345849e-05
In [56]:
options(repr.plot.width=16, repr.plot.height=8)
ggplot(loan, aes(x=factor(inq_last_6mths), y=int_rate)) + 
geom_boxplot(color='steelblue') + labs(x='number of personal finance inquiries') + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")
In [57]:
options(repr.plot.width=16, repr.plot.height=8)
ggplot(as.data.frame(table(loan$inq_last_6mths)), aes(x=Var1, y=Freq)) + 
geom_bar(stat="identity") + 
theme(axis.text.x = element_text(angle=0, hjust=1, vjust=.5),
        legend.position = "none")
In [58]:
# binning: > 4 is 5
loan$gen_inq_last_6mths = loan$inq_last_6mths
loan$gen_inq_last_6mths[loan$inq_last_6mths > 4] = 5

17. last_credit_pull_d with int_rate

last_credit_pull_d : The most recent month LC pulled credit for this loan

In [59]:
sum(is.na(loan$last_credit_pull_d_date))/dim(loan)[1]
5.97264528459655e-05
In [60]:
library("zoo")
loan$last_credit_pull_d_date = as.Date(as.yearmon(loan$last_credit_pull_d, "%b-%Y"))
In [61]:
options(repr.plot.width=20, repr.plot.height=8)
ggplot(loan, aes(x=factor(last_credit_pull_d_date), y=int_rate)) + 
geom_boxplot(color='steelblue') + labs(x='last_credit_pull_d_date') + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")
In [62]:
options(repr.plot.width=16, repr.plot.height=8)
ggplot(as.data.frame(table(loan$last_credit_pull_d_date)), aes(x=Var1, y=log(Freq))) + 
geom_bar(stat="identity") + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")

18. total_acc with int_rate

total_acc : The total number of credit lines currently in the borrower's credit file

In [63]:
sum(is.na(loan$total_acc))/dim(loan)[1]
3.26805119345849e-05
In [64]:
options(repr.plot.width=16, repr.plot.height=8)
ggplot(loan, aes(x=factor(total_acc), y=int_rate)) + 
geom_boxplot(color='steelblue') + labs(x="The total number of credit lines currently in the borrower's credit file") + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")
In [65]:
options(repr.plot.width=16, repr.plot.height=8)
ggplot(as.data.frame(table(loan$total_acc)), aes(x=Var1, y=Freq)) + 
geom_bar(stat="identity") + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")

could be modeled as log normal

19. tot_cur_bal with int_rate¶

tot_cur_bal : Total current balance of all accounts

In [66]:
sum(is.na(loan$tot_cur_bal))/dim(loan)[1]
0.0791950226453409
In [67]:
# linear relationship
options(repr.plot.width=8, repr.plot.height=8)
ggplot(loan, aes(x = tot_cur_bal, y=int_rate)) + geom_hex(bins=100) + stat_smooth(method=lm)
Warning message:
“Removed 70276 rows containing non-finite values (stat_binhex).”Warning message:
“Removed 70276 rows containing non-finite values (stat_smooth).”
In [68]:
options(repr.plot.width=8, repr.plot.height=8)
ggplot(loan, aes(x = log(tot_cur_bal), y=int_rate)) + geom_hex(bins=100) + stat_smooth(method=lm)
Warning message:
“Removed 70402 rows containing non-finite values (stat_binhex).”Warning message:
“Removed 70402 rows containing non-finite values (stat_smooth).”
In [69]:
# new feature
loan$gen_log_tot_cur_bal = log(loan$tot_cur_bal)

20. all_util with int_rate

all_util : Balance to credit limit on all trades

In [70]:
# drop this feature
sum(is.na(loan$all_util))/dim(loan)[1]
0.975915589618416

21. open_acc with int_rate

open_acc : The number of open credit lines in the borrower's credit file.

In [71]:
sum(is.na(loan$open_acc))/dim(loan)[1]

options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=factor(open_acc), y=int_rate)) + 
geom_boxplot(color='steelblue') + labs(x='number of personal finance inquiries') + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")
3.26805119345849e-05
In [72]:
options(repr.plot.width=16, repr.plot.height=8)
ggplot(as.data.frame(table(loan$open_acc)), aes(x=Var1, y=Freq)) + 
geom_bar(stat="identity") + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")

22. open_acc_6m with int_rate

open_acc_6m : Number of open trades in last 6 months

In [73]:
# drop feature
sum(is.na(loan$open_acc_6m))/dim(loan)[1]
0.975915589618416

23. total_cu_tl with int_rate

total_cu_tl : Number of finance trades

In [74]:
# drop feature
sum(is.na(loan$total_cu_tl))/dim(loan)[1]
0.975915589618416

24. acc_now_delinq with int_rate

acc_now_deling : The number of accounts on which the borrower is now delinquent.

In [75]:
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=factor(acc_now_delinq), y=int_rate)) + 
geom_boxplot(color='steelblue') + labs(x='number of personal finance inquiries') + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")
In [76]:
aov.res = aov(int_rate ~ factor(acc_now_delinq), na.action = na.omit, data = loan, )
summary(aov.res)
                           Df   Sum Sq Mean Sq F value Pr(>F)    
factor(acc_now_delinq)      7    12725  1817.9   94.75 <2e-16 ***
Residuals              887342 17025096    19.2                   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
29 observations deleted due to missingness

As the p-value is less than the significance level 0.001, we can conclude that there are significant differences between the different verification status.

25. delinq_2yrs with int_rate

delinq_2yrs : The number of 30+ days past-due incidences of delinquency in the borrower's credit file for the past 2 years

In [77]:
options(repr.plot.width=16, repr.plot.height=8)
ggplot(loan, aes(x=factor(delinq_2yrs), y=int_rate)) + 
geom_boxplot(color='steelblue') + labs(x='number of personal finance inquiries') + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")
In [78]:
options(repr.plot.width=16, repr.plot.height=8)
ggplot(as.data.frame(table(loan$delinq_2yrs)), aes(x=Var1, y=log(Freq))) + 
geom_bar(stat="identity") + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")
In [79]:
options(repr.plot.width=4, repr.plot.height=4)
ggplot(loan, aes(x = inq_last_6mths, y=delinq_2yrs)) + geom_hex(bins=50) + stat_smooth(method=lm)
Warning message:
“Removed 29 rows containing non-finite values (stat_binhex).”Warning message:
“Removed 29 rows containing non-finite values (stat_smooth).”

26. mths_since_last_delinq

mths_since_last_delinq : The number of months since the borrower's last delinquency.

In [80]:
options(repr.plot.width=16, repr.plot.height=8)
ggplot(loan, aes(x=factor(mths_since_last_delinq), y=int_rate)) + 
geom_boxplot(color='steelblue') + labs(x='number of personal finance inquiries') + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")
In [81]:
options(repr.plot.width=16, repr.plot.height=8)
ggplot(as.data.frame(table(loan$mths_since_last_delinq)), aes(x=Var1, y=Freq)) + 
geom_bar(stat="identity") + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")

27. collections_12_mths_ex_med

collections_12_mths_ex_med : Number of collections in 12 months excluding medical collections

In [82]:
table(loan$collections_12_mths_ex_med)
     0      1      2      3      4      5      6      7     10     14     16 
875553  10824    732     88     23      7      2      1      1      1      1 
    20 
     1 
In [83]:
options(repr.plot.width=16, repr.plot.height=8)
ggplot(loan, aes(x=factor(collections_12_mths_ex_med), y=int_rate)) + 
geom_boxplot(color='steelblue') + labs(x='number of personal finance inquiries') + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")
In [84]:
options(repr.plot.width=16, repr.plot.height=8)
ggplot(as.data.frame(table(loan$collections_12_mths_ex_med)), aes(x=Var1, y=log(Freq))) + 
geom_bar(stat="identity") + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")

28. tot_coll_amt

tot_coll_amt : Total collection amounts ever owed

In [85]:
options(repr.plot.width=16, repr.plot.height=8)
ggplot(loan, aes(x=tot_coll_amt, y=int_rate)) + geom_hex(bins=50)
Warning message:
“Removed 70276 rows containing non-finite values (stat_binhex).”
In [86]:
# generate feature log tot_coll_amt
loan$gen_log_tot_coll_amt = log(loan$tot_coll_amt)
In [87]:
options(repr.plot.width=8, repr.plot.height=4)
tot_coll_amt.data = as.data.frame(table(loan$tot_coll_amt))
tot_coll_amt.data = as.data.frame(sapply(tot_coll_amt.data[, 1:2], as.numeric))
ggplot(tot_coll_amt.data, aes(x=Var1, y=log(Freq))) + 
geom_line()

29. pub_rec

pub_rec : Number of derogatory public records

In [88]:
options(repr.plot.width=16, repr.plot.height=8)
ggplot(loan, aes(x=factor(pub_rec), y=int_rate)) + 
geom_boxplot(color='steelblue') + labs(x='number of personal finance inquiries') + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")
In [89]:
options(repr.plot.width=16, repr.plot.height=8)
ggplot(as.data.frame(table(loan$pub_rec)), aes(x=Var1, y=log(Freq))) + 
geom_bar(stat="identity") + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")

30. mths_since_last_major_derog

mths_since_last_major_derog : Months since most recent 90-day or worse rating

In [90]:
options(repr.plot.width=16, repr.plot.height=8)
ggplot(loan, aes(x=factor(mths_since_last_major_derog), y=int_rate)) + 
geom_boxplot(color='steelblue') + labs(x='number of personal finance inquiries') + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")
In [91]:
options(repr.plot.width=48, repr.plot.height=8)
ggplot(as.data.frame(table(loan$mths_since_last_major_derog)), aes(x=Var1, y=Freq)) + 
geom_bar(stat="identity") + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")

31. mths_since_last_record

mths_since_last_record : The number of months since the last public record.

In [92]:
# drop
sum(is.na(loan$mths_since_last_record))/dim(loan)[1]
0.845553027511357

32. il_util

il_util : Ratio of total current balance to high credit/credit limit on all install acct

In [93]:
#drop
sum(is.na(loan$il_util))/dim(loan)[1]
0.979020238252201

33. mths_since_rcnt_il

mths_since_rcnt_il : Months since most recent installment accounts opened

In [94]:
# drop
sum(is.na(loan$mths_since_rcnt_il))/dim(loan)[1]
0.976548915401424

34. open_il_12m , 31. open_il_24m & open_il_6m¶

open_il_24m : Number of installment accounts opened in past 24 months

In [95]:
# drop
sum(is.na(loan$open_il_12m))/dim(loan)[1]
sum(is.na(loan$open_il_24m))/dim(loan)[1]
sum(is.na(loan$open_il_6m))/dim(loan)[1]
0.975915589618416
0.975915589618416
0.975915589618416

35. total_bal_il, max_bal_bc,open_rv_12m & open_rv_24m

total_bal_il : Total current balance of all installment accounts
max_bal_bc : Maximum current balance owed on all revolving accounts
open_rv_12m : Number of revolving trades opened in past 12 months
open_rv_24m : Number of revolving trades opened in past 24 months
In [96]:
# drop
sum(is.na(loan$total_bal_il))/dim(loan)[1]
sum(is.na(loan$max_bal_bc))/dim(loan)[1]
sum(is.na(loan$open_rv_12m))/dim(loan)[1]
sum(is.na(loan$open_rv_24m))/dim(loan)[1]
0.975915589618416
0.975915589618416
0.975915589618416
0.975915589618416

36. revol_bal

revol_bal : Total credit revolving balance

In [97]:
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=revol_bal, y=int_rate)) + geom_hex(bins=50)

options(repr.plot.width=8, repr.plot.height=4)
revol_bal.data = as.data.frame(table(loan$revol_bal))
revol_bal.data = as.data.frame(sapply(revol_bal.data[, 1:2], as.numeric))
ggplot(revol_bal.data, aes(x=Var1, y=log(Freq))) + 
geom_line()

37. revol_util

revol_util : Revolving line utilization rate, or the amount of credit the borrower is using relative to all available revolving credit.

In [98]:
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=revol_util, y=int_rate)) + geom_hex(bins=50)

revol_util.data = as.data.frame(table(loan$revol_util))
revol_util.data = as.data.frame(sapply(revol_util.data[, 1:2], as.numeric))
ggplot(revol_util.data, aes(x=Var1, y=log(Freq))) + 
geom_line() + geom_point()
Warning message:
“Removed 502 rows containing non-finite values (stat_binhex).”

38. total_rev_hi_lim

total_rev_hi_lim : Total revolving high credit/credit limit

In [99]:
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x= total_rev_hi_lim, y=int_rate)) + geom_hex(bins=50)

options(repr.plot.width=8, repr.plot.height=4)
total_rev_hi_lim.data = as.data.frame(table(loan$total_rev_hi_lim))
total_rev_hi_lim.data = as.data.frame(sapply(total_rev_hi_lim.data[, 1:2], as.numeric))
ggplot(total_rev_hi_lim.data, aes(x=Var1, y=log(Freq))) + 
geom_point()
Warning message:
“Removed 70276 rows containing non-finite values (stat_binhex).”
In [100]:
# New feature
loan$gen_log_total_rev_hi_lim = log(loan$total_rev_hi_lim)

39. policy_code

policy_code : publicly available policy_code=1 only one value, drop

In [101]:
# drop 
unique(loan$policy_code)
1

40. url

url : URL for the LC page with listing data.

In [102]:
# same as id, no use
loan$url[1:10]
  1. 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=1077501'
  2. 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=1077430'
  3. 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=1077175'
  4. 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=1076863'
  5. 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=1075358'
  6. 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=1075269'
  7. 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=1069639'
  8. 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=1072053'
  9. 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=1071795'
  10. 'https://www.lendingclub.com/browse/loanDetail.action?loan_id=1071570'

41. application_type

application_type : Indicates whether the loan is an individual application or a joint application with two co-borrowers

In [103]:
table(loan$application_type)
INDIVIDUAL      JOINT 
    886868        511 
In [104]:
aov.res = aov(int_rate ~ factor(application_type), na.action = na.omit, data = loan, )
summary(aov.res)
                             Df   Sum Sq Mean Sq F value Pr(>F)    
factor(application_type)      1     1895  1894.7   98.69 <2e-16 ***
Residuals                887377 17036439    19.2                   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
In [105]:
# split data to individual & joint

42. desc

desc : Loan description provided by the borrower

In [106]:
# need nlp
loan$desc[1:10]
  1. ' Borrower added on 12/22/11 > I need to upgrade my business technologies.<br>'
  2. ' Borrower added on 12/22/11 > I plan to use this money to finance the motorcycle i am looking at. I plan to have it paid off as soon as possible/when i sell my old bike. I only need this money because the deal im looking at is to good to pass up.<br><br> Borrower added on 12/22/11 > I plan to use this money to finance the motorcycle i am looking at. I plan to have it paid off as soon as possible/when i sell my old bike.I only need this money because the deal im looking at is to good to pass up. I have finished college with an associates degree in business and its takingmeplaces<br>'
  3. ''
  4. ' Borrower added on 12/21/11 > to pay for property tax (borrow from friend, need to pay back) & central A/C need to be replace. I\'m very sorry to let my loan expired last time.<br>'
  5. ' Borrower added on 12/21/11 > I plan on combining three large interest bills together and freeing up some extra each month to pay toward other bills. I\'ve always been a good payor but have found myself needing to make adjustments to my budget due to a medical scare. My job is very stable, I love it.<br>'
  6. ''
  7. ' Borrower added on 12/18/11 > I am planning on using the funds to pay off two retail credit cards with 24.99% interest rates, as well as a major bank credit card with a 18.99% rate. I pay all my bills on time, looking for a lower combined payment and lower monthly payment.<br>'
  8. ' Borrower added on 12/16/11 > Downpayment for a car.<br>'
  9. ' Borrower added on 12/21/11 > I own a small home-based judgment collection business. I have 5 years experience collecting debts. I am now going from a home office to a small office. I also plan to buy a small debt portfolio (eg. $10K for $1M of debt) <br>My score is not A+ because I own my home and have no mortgage.<br>'
  10. ' Borrower added on 12/16/11 > I\'m trying to build up my credit history. I live with my brother and have no car payment or credit cards. I am in community college and work full time. Im going to use the money to make some repairs around the house and get some maintenance done on my car.<br><br> Borrower added on 12/20/11 > $1000 down only $4375 to go. Thanks to everyone that invested so far, looking forward to surprising my brother with the fixes around the house.<br>'

43. id

id : A unique LC assigned ID for the loan listing. no use

44. purpose

purpose : A category provided by the borrower for the loan request.

In [107]:
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=factor(purpose), y=int_rate)) + 
geom_boxplot(color='steelblue') + labs(x='number of personal finance inquiries') + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")
In [108]:
aov.res = aov(int_rate ~ factor(purpose), na.action = na.omit, data = loan, )
summary(aov.res)
                    Df   Sum Sq Mean Sq F value Pr(>F)    
factor(purpose)     13   850015   65386    3584 <2e-16 ***
Residuals       887365 16188319      18                   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

45. title

title : The loan title provided by the borrower

In [109]:
loan$title = tolower(loan$title)            # convert all string to lowercase
unique(loan$title)
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.
In [110]:
unique(loan$title)[which(table(loan$title)>500)]
  1. 'payoff the cards'
  2. 'one debt group'
  3. 'relief assistance'
  4. 'paythatcreditcardoff!!!'
  5. 'brad\'s loan'
  6. 'consolidate and purchase engagement ring'
  7. 'john\'s personal loan'
  8. 'home improvement - patio'
  9. 'dale\'s loan'
  10. 'money 2010'
  11. 'small credit card debt consolidation'
  12. 'quick pay off'
  13. 'my wife\'s education fund'
  14. 'help this reliable partner consolidate high interest debt!'
  15. 'the backyard oasis'
  16. 'loan refinancing'
  17. 'remodel my kitchen'
  18. 'keeping credit score good'
  19. 'home theater completion'
  20. 'land investment'
  21. 'marlene'
  22. 'bali and hong kong'
  23. 'expanding business (again)'
  24. 'paying down some high rate credit cards'
  25. 'lowerrate'
  26. 'credit card debts consolidation'
  27. 'credit card bye bye'
  28. 'consilation'
  29. 'happy day'
  30. 'debt demolition'
  31. 'one loan to pay them all!'
  32. 'no more banks'
  33. 'get it done!'
  34. 'short and sweet'
  35. 'spring debt consolidation'
  36. 'debt free atlast'
  37. ' home improv '
  38. 'three years to get out of debt or die'
  39. '$$ thanks lending club $$'
  40. 'my first lending club loan'
  41. 'thank goodness!'
  42. 'upgrade and remodeling'
  43. '36 1000 loan'
  44. 'debt-free in three years'
  45. 'skim loan'
In [111]:
# bar plot 
title.pop = sort(table(loan$title)[which(table(loan$title)>500)], decreasing = T)
title.pop = as.data.frame(title.pop)

options(repr.plot.width=30, repr.plot.height=10)
ggplot(data = title.pop, aes(x=Var1, y=Freq)) + 
  geom_bar(stat="identity") + 
#    scale_y_log10() + 
    scale_x_discrete(name ="emp_titles", 
                     limits=title.pop$Var1)+
  theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")

46. term

term : The number of payments on the loan. Values are in months and can be either 36 or 60.

In [112]:
unique(loan$term)

t.test(int_rate ~ term, data = loan, na.action = na.omit) # significant
  1. ' 36 months'
  2. ' 60 months'
	Welch Two Sample t-test

data:  int_rate by term
t = -431.12, df = 467040, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -4.111525 -4.074310
sample estimates:
mean in group  36 months mean in group  60 months 
                12.01868                 16.11160 
In [113]:
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=factor(term), y=int_rate)) + 
geom_boxplot(color='steelblue') + labs(x='number of payments on the loan') + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")

47. issue_d

issue_d : The month which the loan was funded

In [114]:
loan$issue_d_date = as.Date(as.yearmon(loan$issue_d, "%b-%Y"))

options(repr.plot.width=16, repr.plot.height=4)
ggplot(loan, aes(x=factor(issue_d_date), y=int_rate)) + 
geom_boxplot(color='steelblue') + labs(x='number of payments on the loan') + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")

options(repr.plot.width=16, repr.plot.height=8)
ggplot(as.data.frame(table(loan$issue_d_date)), aes(x=Var1, y=Freq)) + 
geom_bar(stat="identity") + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")
In [115]:
# generate feature issue_d, issue_year, issue_mon
loan$gen_issue_d <- as.Date(as.yearmon(loan$issue_d, "%b-%Y"))
loan$gen_issue_year <- as.character(format(loan$gen_issue_d, "%Y"))
loan$gen_issue_mon <- as.character(format(loan$gen_issue_d, "%m"))

48. initial_list_status

initial_list_status : The initial listing status of the loan. Possible values are – W, F

In [116]:
unique(loan$initial_list_status)

t.test(int_rate ~ initial_list_status, data = loan, na.action = na.omit) # significant
  1. 'f'
  2. 'w'
	Welch Two Sample t-test

data:  int_rate by initial_list_status
t = 109.03, df = 885520, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.9895826 1.0258105
sample estimates:
mean in group f mean in group w 
       13.73565        12.72795 
In [117]:
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=factor(initial_list_status), y=int_rate)) + 
geom_boxplot(color='steelblue') + labs(x='number of payments on the loan') + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")

49. loan_amnt

loan_amnt : The listed amount of the loan applied for by the borrower. If at some point in time, the credit department reduces the loan amount, then it will be reflected in this value.

In [118]:
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=loan_amnt, y= ..density..)) + 
geom_density()
In [119]:
ggplot(loan, aes(x= log(loan_amnt), y=int_rate)) + geom_hex(bins=50) + stat_smooth(method=lm)
In [120]:
loan$gen_log_loan_amnt = log(loan$loan_amnt)

50. installment

installment : The monthly payment owed by the borrower if the loan originates.

In [121]:
ggplot(loan, aes(x= installment, y=int_rate)) + 
stat_density_2d(aes(fill = ..level..), geom = "polygon") + 
stat_smooth(method=lm)
In [122]:
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=installment, y= ..density..)) + 
geom_density()

51. funded_amnt

funded_amnt : The total amount committed to that loan at that point in time.

In [123]:
ggplot(loan, aes(x= funded_amnt, y=int_rate)) + 
stat_density_2d(aes(fill = ..level..), geom = "polygon") + 
stat_smooth(method=lm)
In [124]:
options(repr.plot.width=8, repr.plot.height=4)
ggplot(loan, aes(x=funded_amnt, y= ..density..)) + 
geom_density()
In [125]:
# only keep one is enough
cor(loan$funded_amnt, loan$funded_amnt_inv)
0.998025088623687

52. last_pymnt_amnt

last_pymnt_amnt : Last total payment amount received.

In [126]:
ggplot(loan, aes(x= last_pymnt_amnt, y=int_rate)) + 
stat_density_2d(aes(fill = ..level..), geom = "polygon") 
# stat_smooth(method=lm)

53. last_pymnt_d

last_pymnt_d : Last month payment was received

In [127]:
loan$last_pymnt_d_date = as.Date(as.yearmon(loan$last_pymnt_d, "%b-%Y"))

options(repr.plot.width=20, repr.plot.height=8)
ggplot(loan, aes(x=factor(last_pymnt_d_date), y=int_rate)) + 
geom_boxplot(color='steelblue') + labs(x='emp_length_sim') + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")

54. next_pymnt_d

next_pymnt_d : Next scheduled payment date

In [128]:
loan$next_pymnt_d_date = as.Date(as.yearmon(loan$next_pymnt_d, "%b-%Y"))
options(repr.plot.width=20, repr.plot.height=8)
ggplot(loan, aes(x=factor(next_pymnt_d_date), y=int_rate)) + 
geom_boxplot(color='steelblue') + labs(x='emp_length_sim') + 
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none")

55. pymnt_plan

pymnt_plan : Indicates if a payment plan has been put in place for the loan

In [129]:
# basic no use
table(loan$pymnt_plan)
     n      y 
887369     10 

56. recoveries

recoveries : post charge off gross recovery

In [130]:
options(repr.plot.width=4, repr.plot.height=4)
ggplot(loan, aes(x=recoveries, y=int_rate)) + geom_hex(bins=50) #+ stat_smooth(method=lm)

57. total_pymnt

total_pymnt : Payments received to date for total amount funded

In [131]:
ggplot(loan, aes(x= total_pymnt, y=int_rate)) + 
stat_density_2d(aes(fill = ..level..), geom = "polygon")
In [132]:
# only keep one is enough
cor(loan$total_pymnt, loan$total_pymnt_inv)
0.99759231564028

58. total_rec_int

total_rec_int : Interest received to date

In [133]:
ggplot(loan, aes(x= total_rec_int, y=int_rate)) + 
stat_density_2d(aes(fill = ..level..), geom = "polygon")

59. total_rec_late_fee

total_rec_late_fee : Late fees received to date

In [134]:
options(repr.plot.width=8, repr.plot.height=8)
ggplot(loan, aes(x=total_rec_late_fee, y=int_rate)) + geom_hex(bins=50) #+ stat_smooth(method=lm)
In [135]:
cor(loan$total_rec_late_fee, loan$int_rate)
0.0571501214920043

60. total_rec_prncp

total_rec_prncp : Principal received to date

In [136]:
ggplot(loan, aes(x= total_rec_prncp, y=int_rate)) + 
stat_density_2d(aes(fill = ..level..), geom = "polygon")

61. collection_recovery_fee

collection_recovery_fee : post charge off collection fee

In [137]:
options(repr.plot.width=8, repr.plot.height=8)
ggplot(loan, aes(x=collection_recovery_fee, y=int_rate)) + geom_hex(bins=50) #+ stat_smooth(method=lm)

62. out_prncp

out_prncp : Remaining outstanding principal for total amount funded

In [138]:
ggplot(loan, aes(x=out_prncp, y=int_rate)) + 
stat_density_2d(aes(fill = ..level..), geom = "polygon")
In [139]:
# keep one is enough
cor(loan$out_prncp, loan$out_prncp_inv)
0.999997246563639

Correlation plot for numerical features

In [140]:
num_var=c("int_rate","annual_inc","annual_inc_joint","dti",'total_acc','tot_cur_bal','open_acc','open_acc_6m',
          'acc_now_delinq','delinq_2yrs','mths_since_last_delinq','collections_12_mths_ex_med','tot_coll_amt',
          'pub_rec','mths_since_last_major_derog','mths_since_last_record','il_util','mths_since_rcnt_il',
          'open_il_12m','open_il_24m','open_il_6m','total_bal_il','open_rv_12m','revol_bal','revol_util',
          'loan_amnt','installment','funded_amnt','funded_amnt_inv','last_pymnt_amnt','recoveries','total_pymnt',
          'total_pymnt_inv','total_rec_int','total_rec_late_fee','total_rec_prncp','collection_recovery_fee',
          'out_prncp','out_prncp_inv')
#Plot the correlation matrix
library(corrplot)
correlations <- cor(loan[, num_var],use="complete.obs")
#print(correlations)
corrplot(correlations, method = "square", tl.cex = 1, type = 'lower')
corrplot 0.84 loaded
Warning message in cor(loan[, num_var], use = "complete.obs"):
“the standard deviation is zero”

missing value treatment

•   Remove features with too many missing value, or remove all rows with NA if you have a lot of data
•   If not missing at random, add new level to represent NA, impute with 0, or generate new feature.
•   If missing at random, imputation using summary stats or modeling way.
In [141]:
colnames(loan)
  1. 'id'
  2. 'member_id'
  3. 'loan_amnt'
  4. 'funded_amnt'
  5. 'funded_amnt_inv'
  6. 'term'
  7. 'int_rate'
  8. 'installment'
  9. 'grade'
  10. 'sub_grade'
  11. 'emp_title'
  12. 'emp_length'
  13. 'home_ownership'
  14. 'annual_inc'
  15. 'verification_status'
  16. 'issue_d'
  17. 'loan_status'
  18. 'pymnt_plan'
  19. 'url'
  20. 'desc'
  21. 'purpose'
  22. 'title'
  23. 'zip_code'
  24. 'addr_state'
  25. 'dti'
  26. 'delinq_2yrs'
  27. 'earliest_cr_line'
  28. 'inq_last_6mths'
  29. 'mths_since_last_delinq'
  30. 'mths_since_last_record'
  31. 'open_acc'
  32. 'pub_rec'
  33. 'revol_bal'
  34. 'revol_util'
  35. 'total_acc'
  36. 'initial_list_status'
  37. 'out_prncp'
  38. 'out_prncp_inv'
  39. 'total_pymnt'
  40. 'total_pymnt_inv'
  41. 'total_rec_prncp'
  42. 'total_rec_int'
  43. 'total_rec_late_fee'
  44. 'recoveries'
  45. 'collection_recovery_fee'
  46. 'last_pymnt_d'
  47. 'last_pymnt_amnt'
  48. 'next_pymnt_d'
  49. 'last_credit_pull_d'
  50. 'collections_12_mths_ex_med'
  51. 'mths_since_last_major_derog'
  52. 'policy_code'
  53. 'application_type'
  54. 'annual_inc_joint'
  55. 'dti_joint'
  56. 'verification_status_joint'
  57. 'acc_now_delinq'
  58. 'tot_coll_amt'
  59. 'tot_cur_bal'
  60. 'open_acc_6m'
  61. 'open_il_6m'
  62. 'open_il_12m'
  63. 'open_il_24m'
  64. 'mths_since_rcnt_il'
  65. 'total_bal_il'
  66. 'il_util'
  67. 'open_rv_12m'
  68. 'open_rv_24m'
  69. 'max_bal_bc'
  70. 'all_util'
  71. 'total_rev_hi_lim'
  72. 'inq_fi'
  73. 'total_cu_tl'
  74. 'inq_last_12m'
  75. 'earliest_cr_line_date'
  76. 'earliest_cr_line_year'
  77. 'earliest_cr_line_mon'
  78. 'last_credit_pull_d_date'
  79. 'last_credit_pull_d_year'
  80. 'last_credit_pull_d_mon'
  81. 'issue_d_date'
  82. 'issue_d_year'
  83. 'issue_d_mon'
  84. 'last_pymnt_d_date'
  85. 'last_pymnt_d_year'
  86. 'last_pymnt_d_mon'
  87. 'next_pymnt_d_date'
  88. 'next_pymnt_d_year'
  89. 'next_pymnt_d_mon'
  90. 'emp_length_sim'
  91. 'gen_home_ownership'
  92. 'gen_add_state_zip'
  93. 'gen_add_state_zip_ave_rate'
  94. 'gen_log_annual_inc'
  95. 'gen_inq_last_6mths'
  96. 'gen_log_tot_cur_bal'
  97. 'gen_log_tot_coll_amt'
  98. 'gen_log_total_rev_hi_lim'
  99. 'gen_issue_d'
  100. 'gen_issue_year'
  101. 'gen_issue_mon'
  102. 'gen_log_loan_amnt'
In [142]:
num.NA <- sort(colSums(sapply(loan, is.na)))

dfnum.NA <- data.frame(ind = c(1:length(num.NA)),
                       percentage = num.NA/nrow(loan),
                       per80 = num.NA/nrow(loan)>=0.2,
                           name = names(num.NA),
                       row.names = NULL) # convert to data.frame

options(repr.plot.width=8, repr.plot.height=4)
ggplot(data = dfnum.NA, aes(x=ind, y=percentage)) + 
  geom_bar(aes(fill=per80), stat="identity") + 
  scale_x_discrete(name ="column names", 
                   limits=dfnum.NA$name)+
  theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none") +
  geom_hline(yintercept = 0.2) + 
  ggtitle("percentage of missing")
In [143]:
loan_cp=loan
# drop missing > 80%
dfnum.NA$name[dfnum.NA$per80]
loan[, as.character(dfnum.NA$name[dfnum.NA$per80])] <- NULL
  1. next_pymnt_d_date
  2. next_pymnt_d_year
  3. next_pymnt_d_mon
  4. mths_since_last_delinq
  5. mths_since_last_major_derog
  6. mths_since_last_record
  7. open_acc_6m
  8. open_il_6m
  9. open_il_12m
  10. open_il_24m
  11. total_bal_il
  12. open_rv_12m
  13. open_rv_24m
  14. max_bal_bc
  15. all_util
  16. inq_fi
  17. total_cu_tl
  18. inq_last_12m
  19. mths_since_rcnt_il
  20. il_util
  21. annual_inc_joint
  22. dti_joint
In [144]:
num.NA <- sort(colSums(sapply(loan, is.na)))

dfnum.NA <- data.frame(ind = c(1:length(num.NA)),
                       percentage = num.NA/nrow(loan),
                       per80 = num.NA/nrow(loan)>=0.2,
                           name = names(num.NA),
                       row.names = NULL) # convert to data.frame

options(repr.plot.width=8, repr.plot.height=4)
ggplot(data = dfnum.NA, aes(x=ind, y=percentage)) + 
  geom_bar(aes(fill=per80), stat="identity") + 
  scale_x_discrete(name ="column names", 
                   limits=dfnum.NA$name)+
  theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none") +
  geom_hline(yintercept = 0.2) + 
  ggtitle("percentage of missing")
In [145]:
num.NA <- sort(sapply(loan, function(x) { sum(is.na(x))} ), decreasing = TRUE)
num.NA
tot_coll_amt
70276
tot_cur_bal
70276
total_rev_hi_lim
70276
gen_log_tot_cur_bal
70276
gen_log_tot_coll_amt
70276
gen_log_total_rev_hi_lim
70276
emp_title
51457
last_pymnt_d_date
17659
last_pymnt_d_year
17659
last_pymnt_d_mon
17659
revol_util
502
collections_12_mths_ex_med
145
last_credit_pull_d_date
53
last_credit_pull_d_year
53
last_credit_pull_d_mon
53
delinq_2yrs
29
inq_last_6mths
29
open_acc
29
pub_rec
29
total_acc
29
acc_now_delinq
29
earliest_cr_line_date
29
earliest_cr_line_year
29
earliest_cr_line_mon
29
gen_inq_last_6mths
29
id
0
member_id
0
loan_amnt
0
funded_amnt
0
funded_amnt_inv
0
term
0
int_rate
0
installment
0
grade
0
sub_grade
0
emp_length
0
home_ownership
0
annual_inc
0
verification_status
0
issue_d
0
loan_status
0
pymnt_plan
0
url
0
desc
0
purpose
0
title
0
zip_code
0
addr_state
0
dti
0
earliest_cr_line
0
revol_bal
0
initial_list_status
0
out_prncp
0
out_prncp_inv
0
total_pymnt
0
total_pymnt_inv
0
total_rec_prncp
0
total_rec_int
0
total_rec_late_fee
0
recoveries
0
collection_recovery_fee
0
last_pymnt_d
0
last_pymnt_amnt
0
next_pymnt_d
0
last_credit_pull_d
0
policy_code
0
application_type
0
verification_status_joint
0
issue_d_date
0
issue_d_year
0
issue_d_mon
0
emp_length_sim
0
gen_home_ownership
0
gen_add_state_zip
0
gen_add_state_zip_ave_rate
0
gen_log_annual_inc
0
gen_issue_d
0
gen_issue_year
0
gen_issue_mon
0
gen_log_loan_amnt
0
In [146]:
for(col.i in names(num.NA)[which(num.NA > 0)]) {
  loan[which(is.na(loan[, col.i])), col.i] <- median(loan[, col.i], na.rm = T)}
Warning message in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
“argument is not numeric or logical: returning NA”Warning message in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
“argument is not numeric or logical: returning NA”Warning message in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
“argument is not numeric or logical: returning NA”Warning message in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
“argument is not numeric or logical: returning NA”Warning message in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
“argument is not numeric or logical: returning NA”Warning message in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
“argument is not numeric or logical: returning NA”Warning message in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]):
“argument is not numeric or logical: returning NA”

feature variance¶

In [147]:
apply(loan, 2, function(x){length(unique(x))})
id
887379
member_id
887379
loan_amnt
1372
funded_amnt
1372
funded_amnt_inv
9729
term
2
int_rate
542
installment
68711
grade
7
sub_grade
35
emp_title
261562
emp_length
10
home_ownership
6
annual_inc
49384
verification_status
3
issue_d
103
loan_status
10
pymnt_plan
2
url
887379
desc
124471
purpose
14
title
53298
zip_code
935
addr_state
51
dti
4086
delinq_2yrs
29
earliest_cr_line
698
inq_last_6mths
28
open_acc
77
pub_rec
32
revol_bal
73740
revol_util
1356
total_acc
135
initial_list_status
2
out_prncp
248332
out_prncp_inv
266244
total_pymnt
503828
total_pymnt_inv
506616
total_rec_prncp
260227
total_rec_int
324635
total_rec_late_fee
6114
recoveries
23055
collection_recovery_fee
20414
last_pymnt_d
99
last_pymnt_amnt
232451
next_pymnt_d
101
last_credit_pull_d
104
collections_12_mths_ex_med
12
policy_code
1
application_type
2
verification_status_joint
4
acc_now_delinq
8
tot_coll_amt
10325
tot_cur_bal
327342
total_rev_hi_lim
21251
earliest_cr_line_date
697
earliest_cr_line_year
68
earliest_cr_line_mon
13
last_credit_pull_d_date
103
last_credit_pull_d_year
11
last_credit_pull_d_mon
13
issue_d_date
103
issue_d_year
9
issue_d_mon
12
last_pymnt_d_date
98
last_pymnt_d_year
11
last_pymnt_d_mon
13
emp_length_sim
2
gen_home_ownership
4
gen_add_state_zip
1985
gen_add_state_zip_ave_rate
353
gen_log_annual_inc
49249
gen_inq_last_6mths
6
gen_log_tot_cur_bal
327342
gen_log_tot_coll_amt
10325
gen_log_total_rev_hi_lim
21251
gen_issue_d
103
gen_issue_year
9
gen_issue_mon
12
gen_log_loan_amnt
1372
In [148]:
# remove all unique and single
loan[, c("id", "member_id", "url", "desc")] <- NULL
In [149]:
# drop loan payment features
loan[, c("installment", "funded_amnt", "funded_amnt_inv",
         "last_pymnt_amnt", "last_pymnt_d", "next_pymnt_d", 'gen_add_state_zip',
         "pymnt_plan", "recoveries", "total_pymnt", "verification_status", 'verification_status_joint',
         "total_pymnt_inv", "total_rec_int", "total_rec_late_fee",
         "total_rec_prncp", "collection_recovery_fee", "out_prncp", "out_prncp_inv")] <- NULL
In [150]:
# drop potential response variables
loan[, c("grade", "sub_grade", "loan_status")] <- NULL
In [151]:
colnames(loan)
  1. 'loan_amnt'
  2. 'term'
  3. 'int_rate'
  4. 'emp_title'
  5. 'emp_length'
  6. 'home_ownership'
  7. 'annual_inc'
  8. 'issue_d'
  9. 'purpose'
  10. 'title'
  11. 'zip_code'
  12. 'addr_state'
  13. 'dti'
  14. 'delinq_2yrs'
  15. 'earliest_cr_line'
  16. 'inq_last_6mths'
  17. 'open_acc'
  18. 'pub_rec'
  19. 'revol_bal'
  20. 'revol_util'
  21. 'total_acc'
  22. 'initial_list_status'
  23. 'last_credit_pull_d'
  24. 'collections_12_mths_ex_med'
  25. 'policy_code'
  26. 'application_type'
  27. 'acc_now_delinq'
  28. 'tot_coll_amt'
  29. 'tot_cur_bal'
  30. 'total_rev_hi_lim'
  31. 'earliest_cr_line_date'
  32. 'earliest_cr_line_year'
  33. 'earliest_cr_line_mon'
  34. 'last_credit_pull_d_date'
  35. 'last_credit_pull_d_year'
  36. 'last_credit_pull_d_mon'
  37. 'issue_d_date'
  38. 'issue_d_year'
  39. 'issue_d_mon'
  40. 'last_pymnt_d_date'
  41. 'last_pymnt_d_year'
  42. 'last_pymnt_d_mon'
  43. 'emp_length_sim'
  44. 'gen_home_ownership'
  45. 'gen_add_state_zip_ave_rate'
  46. 'gen_log_annual_inc'
  47. 'gen_inq_last_6mths'
  48. 'gen_log_tot_cur_bal'
  49. 'gen_log_tot_coll_amt'
  50. 'gen_log_total_rev_hi_lim'
  51. 'gen_issue_d'
  52. 'gen_issue_year'
  53. 'gen_issue_mon'
  54. 'gen_log_loan_amnt'
In [152]:
# drop "zip_code", "addr_state", "earliest_cr_line", "policy_code"
loan[, c("zip_code", "addr_state", "policy_code", "earliest_cr_line")] <- NULL
In [156]:
# save the updated data
saveRDS(loan, file="loan_updated.Rda")

build model¶

Linear model

Model selection

In [179]:
# split data into train and test for model performance
set.seed(1)
loan.complete = loan[complete.cases(loan), ]
train.ind <- sample(1:dim(loan.complete)[1], 0.7 * dim(loan.complete)[1])
train <- loan.complete[train.ind, ]
test <- loan.complete[-train.ind, ]
In [180]:
mod <- lm(int_rate ~ loan_amnt + term + annual_inc + purpose + dti + delinq_2yrs + 
          inq_last_6mths + open_acc + pub_rec + revol_bal + revol_util + total_acc + 
          initial_list_status + collections_12_mths_ex_med + application_type + acc_now_delinq + 
          tot_coll_amt + tot_cur_bal + total_rev_hi_lim + emp_length_sim + 
          gen_home_ownership + gen_add_state_zip_ave_rate + gen_inq_last_6mths + 
          issue_d_date + gen_issue_year + gen_issue_mon
          ,data = train)
summary(mod)
Call:
lm(formula = int_rate ~ loan_amnt + term + annual_inc + purpose + 
    dti + delinq_2yrs + inq_last_6mths + open_acc + pub_rec + 
    revol_bal + revol_util + total_acc + initial_list_status + 
    collections_12_mths_ex_med + application_type + acc_now_delinq + 
    tot_coll_amt + tot_cur_bal + total_rev_hi_lim + emp_length_sim + 
    gen_home_ownership + gen_add_state_zip_ave_rate + gen_inq_last_6mths + 
    issue_d_date + gen_issue_year + gen_issue_mon, data = train)

Residuals:
    Min      1Q  Median      3Q     Max 
-30.456  -2.269  -0.272   2.001 155.436 

Coefficients:
                             Estimate Std. Error  t value Pr(>|t|)    
(Intercept)                -2.032e+04  7.773e+02  -26.144  < 2e-16 ***
loan_amnt                   3.997e-05  6.277e-07   63.671  < 2e-16 ***
term 60 months              4.040e+00  1.031e-02  391.693  < 2e-16 ***
annual_inc                 -2.611e-06  7.245e-08  -36.037  < 2e-16 ***
purposecredit_card         -1.339e+00  4.376e-02  -30.605  < 2e-16 ***
purposedebt_consolidation   1.578e-01  4.316e-02    3.657 0.000256 ***
purposeeducational          6.860e-01  1.941e-01    3.534 0.000409 ***
purposehome_improvement     5.415e-01  4.628e-02   11.702  < 2e-16 ***
purposehouse                2.967e+00  7.816e-02   37.958  < 2e-16 ***
purposemajor_purchase       4.345e-01  5.233e-02    8.303  < 2e-16 ***
purposemedical              2.169e+00  6.100e-02   35.554  < 2e-16 ***
purposemoving               3.270e+00  6.928e-02   47.196  < 2e-16 ***
purposeother                2.376e+00  4.683e-02   50.742  < 2e-16 ***
purposerenewable_energy     3.142e+00  1.695e-01   18.544  < 2e-16 ***
purposesmall_business       3.462e+00  5.893e-02   58.741  < 2e-16 ***
purposevacation             2.288e+00  7.216e-02   31.713  < 2e-16 ***
purposewedding              1.590e+00  9.126e-02   17.421  < 2e-16 ***
dti                         5.088e-02  5.892e-04   86.361  < 2e-16 ***
delinq_2yrs                 3.225e-01  5.003e-03   64.463  < 2e-16 ***
inq_last_6mths             -3.240e-01  3.569e-02   -9.076  < 2e-16 ***
open_acc                    6.485e-02  1.178e-03   55.070  < 2e-16 ***
pub_rec                     5.091e-01  7.751e-03   65.678  < 2e-16 ***
revol_bal                   9.373e-06  3.766e-07   24.890  < 2e-16 ***
revol_util                  4.130e-02  2.195e-04  188.176  < 2e-16 ***
total_acc                  -4.095e-02  5.226e-04  -78.367  < 2e-16 ***
initial_list_statusw       -9.458e-01  9.427e-03 -100.328  < 2e-16 ***
collections_12_mths_ex_med  6.244e-01  3.258e-02   19.168  < 2e-16 ***
application_typeJOINT       8.973e-01  2.115e-01    4.242 2.22e-05 ***
acc_now_delinq              1.552e+00  5.487e-02   28.284  < 2e-16 ***
tot_coll_amt                1.176e-06  3.464e-07    3.394 0.000689 ***
tot_cur_bal                -1.967e-06  3.832e-08  -51.345  < 2e-16 ***
total_rev_hi_lim           -1.725e-05  2.271e-07  -75.938  < 2e-16 ***
emp_length_sim>= 5         -7.657e-02  8.918e-03   -8.585  < 2e-16 ***
gen_home_ownershipothers    1.189e+00  2.664e-01    4.465 8.01e-06 ***
gen_home_ownershipOWN       5.142e-01  1.554e-02   33.099  < 2e-16 ***
gen_home_ownershipRENT      6.435e-01  1.068e-02   60.268  < 2e-16 ***
gen_add_state_zip_ave_rate  4.850e-01  1.108e-02   43.781  < 2e-16 ***
gen_inq_last_6mths          1.384e+00  3.646e-02   37.968  < 2e-16 ***
issue_d_date                1.504e+00  5.752e-02   26.145  < 2e-16 ***
gen_issue_year2008         -5.499e+02  2.104e+01  -26.136  < 2e-16 ***
gen_issue_year2009         -1.098e+03  4.205e+01  -26.122  < 2e-16 ***
gen_issue_year2010         -1.649e+03  6.304e+01  -26.150  < 2e-16 ***
gen_issue_year2011         -2.197e+03  8.404e+01  -26.147  < 2e-16 ***
gen_issue_year2012         -2.746e+03  1.051e+02  -26.128  < 2e-16 ***
gen_issue_year2013         -3.294e+03  1.261e+02  -26.123  < 2e-16 ***
gen_issue_year2014         -3.843e+03  1.471e+02  -26.131  < 2e-16 ***
gen_issue_year2015         -4.393e+03  1.681e+02  -26.139  < 2e-16 ***
gen_issue_mon02            -4.673e+01  1.783e+00  -26.213  < 2e-16 ***
gen_issue_mon03            -8.888e+01  3.397e+00  -26.169  < 2e-16 ***
gen_issue_mon04            -1.354e+02  5.180e+00  -26.134  < 2e-16 ***
gen_issue_mon05            -1.805e+02  6.905e+00  -26.142  < 2e-16 ***
gen_issue_mon06            -2.271e+02  8.688e+00  -26.143  < 2e-16 ***
gen_issue_mon07            -2.723e+02  1.041e+01  -26.146  < 2e-16 ***
gen_issue_mon08            -3.190e+02  1.220e+01  -26.150  < 2e-16 ***
gen_issue_mon09            -3.657e+02  1.398e+01  -26.155  < 2e-16 ***
gen_issue_mon10            -4.109e+02  1.571e+01  -26.161  < 2e-16 ***
gen_issue_mon11            -4.576e+02  1.749e+01  -26.168  < 2e-16 ***
gen_issue_mon12            -5.027e+02  1.921e+01  -26.164  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.21 on 573546 degrees of freedom
Multiple R-squared:  0.4642,	Adjusted R-squared:  0.4641 
F-statistic:  8716 on 57 and 573546 DF,  p-value: < 2.2e-16
In [199]:
#plot(mod)
In [183]:
# remove extreme outliers
train.sub = train[-which(rownames(train) %in% c(534801, 755747, 301371)), ]
In [184]:
mod2 <- lm(int_rate ~ loan_amnt + term + annual_inc + purpose + dti + delinq_2yrs + 
          inq_last_6mths + open_acc + pub_rec + revol_bal + revol_util + total_acc + 
          initial_list_status + collections_12_mths_ex_med + application_type + acc_now_delinq + 
          tot_coll_amt + tot_cur_bal + total_rev_hi_lim + emp_length_sim + 
          gen_home_ownership + gen_add_state_zip_ave_rate + gen_inq_last_6mths + 
          issue_d_date + gen_issue_year + gen_issue_mon
          ,data = train.sub)
summary(mod2)
Call:
lm(formula = int_rate ~ loan_amnt + term + annual_inc + purpose + 
    dti + delinq_2yrs + inq_last_6mths + open_acc + pub_rec + 
    revol_bal + revol_util + total_acc + initial_list_status + 
    collections_12_mths_ex_med + application_type + acc_now_delinq + 
    tot_coll_amt + tot_cur_bal + total_rev_hi_lim + emp_length_sim + 
    gen_home_ownership + gen_add_state_zip_ave_rate + gen_inq_last_6mths + 
    issue_d_date + gen_issue_year + gen_issue_mon, data = train.sub)

Residuals:
    Min      1Q  Median      3Q     Max 
-30.365  -2.262  -0.268   1.994 205.353 

Coefficients:
                             Estimate Std. Error t value Pr(>|t|)    
(Intercept)                -2.043e+04  7.752e+02 -26.354  < 2e-16 ***
loan_amnt                   4.263e-05  6.278e-07  67.907  < 2e-16 ***
term 60 months              4.026e+00  1.029e-02 391.343  < 2e-16 ***
annual_inc                 -2.579e-06  7.226e-08 -35.694  < 2e-16 ***
purposecredit_card         -1.328e+00  4.364e-02 -30.428  < 2e-16 ***
purposedebt_consolidation   1.577e-01  4.305e-02   3.663 0.000250 ***
purposeeducational          7.168e-01  1.936e-01   3.703 0.000213 ***
purposehome_improvement     5.357e-01  4.615e-02  11.608  < 2e-16 ***
purposehouse                2.954e+00  7.795e-02  37.890  < 2e-16 ***
purposemajor_purchase       4.369e-01  5.219e-02   8.372  < 2e-16 ***
purposemedical              2.170e+00  6.083e-02  35.673  < 2e-16 ***
purposemoving               3.270e+00  6.910e-02  47.326  < 2e-16 ***
purposeother                2.376e+00  4.670e-02  50.873  < 2e-16 ***
purposerenewable_energy     3.145e+00  1.690e-01  18.611  < 2e-16 ***
purposesmall_business       3.453e+00  5.878e-02  58.753  < 2e-16 ***
purposevacation             2.292e+00  7.196e-02  31.845  < 2e-16 ***
purposewedding              1.601e+00  9.101e-02  17.588  < 2e-16 ***
dti                         5.067e-02  5.876e-04  86.224  < 2e-16 ***
delinq_2yrs                 3.136e-01  4.992e-03  62.822  < 2e-16 ***
inq_last_6mths             -3.224e-01  3.560e-02  -9.058  < 2e-16 ***
open_acc                    7.058e-02  1.179e-03  59.874  < 2e-16 ***
pub_rec                     4.910e-01  7.736e-03  63.471  < 2e-16 ***
revol_bal                   1.587e-05  3.931e-07  40.371  < 2e-16 ***
revol_util                  3.888e-02  2.231e-04 174.252  < 2e-16 ***
total_acc                  -4.115e-02  5.212e-04 -78.957  < 2e-16 ***
initial_list_statusw       -9.379e-01  9.403e-03 -99.746  < 2e-16 ***
collections_12_mths_ex_med  6.099e-01  3.249e-02  18.773  < 2e-16 ***
application_typeJOINT       8.668e-01  2.110e-01   4.109 3.98e-05 ***
acc_now_delinq              1.565e+00  5.472e-02  28.604  < 2e-16 ***
tot_coll_amt                1.131e-06  3.455e-07   3.273 0.001062 ** 
tot_cur_bal                -1.886e-06  3.824e-08 -49.324  < 2e-16 ***
total_rev_hi_lim           -2.402e-05  2.569e-07 -93.508  < 2e-16 ***
emp_length_sim>= 5         -6.922e-02  8.895e-03  -7.782 7.17e-15 ***
gen_home_ownershipothers    1.202e+00  2.657e-01   4.523 6.11e-06 ***
gen_home_ownershipOWN       5.262e-01  1.550e-02  33.958  < 2e-16 ***
gen_home_ownershipRENT      6.416e-01  1.065e-02  60.259  < 2e-16 ***
gen_add_state_zip_ave_rate  4.763e-01  1.105e-02  43.108  < 2e-16 ***
gen_inq_last_6mths          1.380e+00  3.636e-02  37.962  < 2e-16 ***
issue_d_date                1.512e+00  5.736e-02  26.355  < 2e-16 ***
gen_issue_year2008         -5.528e+02  2.098e+01 -26.346  < 2e-16 ***
gen_issue_year2009         -1.104e+03  4.193e+01 -26.332  < 2e-16 ***
gen_issue_year2010         -1.657e+03  6.287e+01 -26.359  < 2e-16 ***
gen_issue_year2011         -2.209e+03  8.381e+01 -26.357  < 2e-16 ***
gen_issue_year2012         -2.760e+03  1.048e+02 -26.337  < 2e-16 ***
gen_issue_year2013         -3.311e+03  1.257e+02 -26.332  < 2e-16 ***
gen_issue_year2014         -3.864e+03  1.467e+02 -26.341  < 2e-16 ***
gen_issue_year2015         -4.416e+03  1.676e+02 -26.348  < 2e-16 ***
gen_issue_mon02            -4.698e+01  1.778e+00 -26.422  < 2e-16 ***
gen_issue_mon03            -8.935e+01  3.387e+00 -26.379  < 2e-16 ***
gen_issue_mon04            -1.361e+02  5.166e+00 -26.344  < 2e-16 ***
gen_issue_mon05            -1.815e+02  6.886e+00 -26.351  < 2e-16 ***
gen_issue_mon06            -2.283e+02  8.665e+00 -26.352  < 2e-16 ***
gen_issue_mon07            -2.737e+02  1.039e+01 -26.356  < 2e-16 ***
gen_issue_mon08            -3.206e+02  1.216e+01 -26.359  < 2e-16 ***
gen_issue_mon09            -3.676e+02  1.394e+01 -26.364  < 2e-16 ***
gen_issue_mon10            -4.130e+02  1.566e+01 -26.370  < 2e-16 ***
gen_issue_mon11            -4.601e+02  1.744e+01 -26.377  < 2e-16 ***
gen_issue_mon12            -5.054e+02  1.916e+01 -26.373  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.201 on 573545 degrees of freedom
Multiple R-squared:  0.4671,	Adjusted R-squared:  0.467 
F-statistic:  8818 on 57 and 573545 DF,  p-value: < 2.2e-16
In [186]:
train.sub2 = train.sub[-which(rownames(train.sub) %in% c(52576)), ]
mod3 <- lm(int_rate ~ loan_amnt + term + annual_inc + purpose + dti + delinq_2yrs + I(loan_amnt^2) +
           inq_last_6mths + open_acc + pub_rec + revol_bal + revol_util + total_acc +  sqrt(annual_inc) +
           initial_list_status + collections_12_mths_ex_med + application_type + acc_now_delinq + 
           tot_coll_amt + tot_cur_bal + total_rev_hi_lim + emp_length_sim + 
           gen_home_ownership + gen_add_state_zip_ave_rate + gen_inq_last_6mths + 
           issue_d_date
           ,data = train.sub2)
summary(mod3)
#plot(mod3)
Call:
lm(formula = int_rate ~ loan_amnt + term + annual_inc + purpose + 
    dti + delinq_2yrs + I(loan_amnt^2) + inq_last_6mths + open_acc + 
    pub_rec + revol_bal + revol_util + total_acc + sqrt(annual_inc) + 
    initial_list_status + collections_12_mths_ex_med + application_type + 
    acc_now_delinq + tot_coll_amt + tot_cur_bal + total_rev_hi_lim + 
    emp_length_sim + gen_home_ownership + gen_add_state_zip_ave_rate + 
    gen_inq_last_6mths + issue_d_date, data = train.sub2)

Residuals:
    Min      1Q  Median      3Q     Max 
-31.186  -2.247  -0.241   1.985 194.366 

Coefficients:
                             Estimate Std. Error  t value Pr(>|t|)    
(Intercept)                 1.016e+01  2.232e-01   45.519  < 2e-16 ***
loan_amnt                  -1.445e-04  2.107e-06  -68.561  < 2e-16 ***
term 60 months              4.020e+00  1.046e-02  384.376  < 2e-16 ***
annual_inc                  6.710e-06  1.178e-07   56.943  < 2e-16 ***
purposecredit_card         -8.300e-01  4.387e-02  -18.920  < 2e-16 ***
purposedebt_consolidation   6.476e-01  4.327e-02   14.966  < 2e-16 ***
purposeeducational         -9.204e-01  1.922e-01   -4.788 1.69e-06 ***
purposehome_improvement     9.812e-01  4.636e-02   21.164  < 2e-16 ***
purposehouse                3.395e+00  7.832e-02   43.350  < 2e-16 ***
purposemajor_purchase       6.449e-01  5.242e-02   12.303  < 2e-16 ***
purposemedical              2.357e+00  6.110e-02   38.584  < 2e-16 ***
purposemoving               3.369e+00  6.941e-02   48.542  < 2e-16 ***
purposeother                2.555e+00  4.689e-02   54.493  < 2e-16 ***
purposerenewable_energy     3.122e+00  1.698e-01   18.390  < 2e-16 ***
purposesmall_business       3.697e+00  5.906e-02   62.596  < 2e-16 ***
purposevacation             2.330e+00  7.232e-02   32.213  < 2e-16 ***
purposewedding              1.552e+00  9.139e-02   16.982  < 2e-16 ***
dti                         2.963e-02  6.196e-04   47.830  < 2e-16 ***
delinq_2yrs                 3.276e-01  5.016e-03   65.313  < 2e-16 ***
I(loan_amnt^2)              5.957e-09  5.484e-11  108.620  < 2e-16 ***
inq_last_6mths             -6.090e-01  3.482e-02  -17.488  < 2e-16 ***
open_acc                    7.999e-02  1.185e-03   67.500  < 2e-16 ***
pub_rec                     5.204e-01  7.767e-03   66.993  < 2e-16 ***
revol_bal                   1.378e-05  3.961e-07   34.779  < 2e-16 ***
revol_util                  4.412e-02  2.240e-04  196.992  < 2e-16 ***
total_acc                  -3.184e-02  5.283e-04  -60.278  < 2e-16 ***
sqrt(annual_inc)           -1.378e-02  1.299e-04 -106.051  < 2e-16 ***
initial_list_statusw       -9.455e-01  9.337e-03 -101.267  < 2e-16 ***
collections_12_mths_ex_med  5.607e-01  3.263e-02   17.181  < 2e-16 ***
application_typeJOINT       1.795e-02  2.118e-01    0.085 0.932444    
acc_now_delinq              1.646e+00  5.498e-02   29.936  < 2e-16 ***
tot_coll_amt                1.332e-06  3.471e-07    3.838 0.000124 ***
tot_cur_bal                -7.641e-07  3.966e-08  -19.264  < 2e-16 ***
total_rev_hi_lim           -2.258e-05  2.587e-07  -87.280  < 2e-16 ***
emp_length_sim>= 5          6.424e-02  8.933e-03    7.192 6.39e-13 ***
gen_home_ownershipothers    3.447e-01  2.657e-01    1.297 0.194634    
gen_home_ownershipOWN       4.518e-01  1.557e-02   29.017  < 2e-16 ***
gen_home_ownershipRENT      6.140e-01  1.070e-02   57.369  < 2e-16 ***
gen_add_state_zip_ave_rate  4.126e-01  1.113e-02   37.078  < 2e-16 ***
gen_inq_last_6mths          1.702e+00  3.561e-02   47.807  < 2e-16 ***
issue_d_date               -2.336e-04  1.045e-05  -22.349  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.216 on 573561 degrees of freedom
Multiple R-squared:  0.4621,	Adjusted R-squared:  0.4621 
F-statistic: 1.232e+04 on 40 and 573561 DF,  p-value: < 2.2e-16
In [187]:
library(car)
vif(mod3)
Warning message:
“package ‘car’ was built under R version 3.4.3”
GVIFDfGVIF^(1/(2*Df))
loan_amnt17.504391 1 4.183825
term 1.290695 1 1.136088
annual_inc 3.542260 1 1.882089
purpose 1.24579213 1.008489
dti 1.453904 1 1.205780
delinq_2yrs 1.055798 1 1.027520
I(loan_amnt^2)15.978649 1 3.997330
inq_last_6mths67.268650 1 8.201747
open_acc 2.208085 1 1.485963
pub_rec 1.051256 1 1.025308
revol_bal 4.221111 1 2.054534
revol_util 1.571167 1 1.253462
total_acc 2.169096 1 1.472785
sqrt(annual_inc) 5.284394 1 2.298781
initial_list_status 1.206664 1 1.098483
collections_12_mths_ex_med 1.010491 1 1.005232
application_type 1.001376 1 1.000688
acc_now_delinq 1.020319 1 1.010108
tot_coll_amt 1.000329 1 1.000165
tot_cur_bal 1.933958 1 1.390668
total_rev_hi_lim 4.175905 1 2.043503
emp_length_sim 1.052663 1 1.025994
gen_home_ownership 1.448458 3 1.063696
gen_add_state_zip_ave_rate 1.038719 1 1.019176
gen_inq_last_6mths67.176971 1 8.196156
issue_d_date 1.317008 1 1.147610
In [188]:
mod4 <- lm(int_rate ~ loan_amnt + term + annual_inc + purpose + dti + delinq_2yrs + I(loan_amnt^2) +
        open_acc + pub_rec + revol_bal + revol_util + total_acc +  sqrt(annual_inc) +
           initial_list_status + collections_12_mths_ex_med + application_type + acc_now_delinq + 
           tot_coll_amt + tot_cur_bal + total_rev_hi_lim + emp_length_sim + 
           gen_home_ownership + gen_add_state_zip_ave_rate + issue_d_date
           ,data = train.sub2)
summary(mod4)
#plot(mod3)
Call:
lm(formula = int_rate ~ loan_amnt + term + annual_inc + purpose + 
    dti + delinq_2yrs + I(loan_amnt^2) + open_acc + pub_rec + 
    revol_bal + revol_util + total_acc + sqrt(annual_inc) + initial_list_status + 
    collections_12_mths_ex_med + application_type + acc_now_delinq + 
    tot_coll_amt + tot_cur_bal + total_rev_hi_lim + emp_length_sim + 
    gen_home_ownership + gen_add_state_zip_ave_rate + issue_d_date, 
    data = train.sub2)

Residuals:
    Min      1Q  Median      3Q     Max 
-38.355  -2.393  -0.279   2.083 206.165 

Coefficients:
                             Estimate Std. Error  t value Pr(>|t|)    
(Intercept)                 1.470e+01  2.332e-01   63.062  < 2e-16 ***
loan_amnt                  -1.634e-04  2.211e-06  -73.889  < 2e-16 ***
term 60 months              4.089e+00  1.098e-02  372.422  < 2e-16 ***
annual_inc                  6.309e-06  1.237e-07   50.992  < 2e-16 ***
purposecredit_card         -8.247e-01  4.606e-02  -17.904  < 2e-16 ***
purposedebt_consolidation   6.889e-01  4.544e-02   15.160  < 2e-16 ***
purposeeducational         -5.870e-01  2.017e-01   -2.910 0.003615 ** 
purposehome_improvement     1.121e+00  4.868e-02   23.025  < 2e-16 ***
purposehouse                3.767e+00  8.222e-02   45.811  < 2e-16 ***
purposemajor_purchase       7.323e-01  5.504e-02   13.304  < 2e-16 ***
purposemedical              2.433e+00  6.416e-02   37.917  < 2e-16 ***
purposemoving               3.449e+00  7.288e-02   47.324  < 2e-16 ***
purposeother                2.611e+00  4.924e-02   53.036  < 2e-16 ***
purposerenewable_energy     3.224e+00  1.783e-01   18.084  < 2e-16 ***
purposesmall_business       3.944e+00  6.200e-02   63.613  < 2e-16 ***
purposevacation             2.396e+00  7.594e-02   31.553  < 2e-16 ***
purposewedding              1.621e+00  9.597e-02   16.889  < 2e-16 ***
dti                         2.952e-02  6.506e-04   45.379  < 2e-16 ***
delinq_2yrs                 3.390e-01  5.267e-03   64.353  < 2e-16 ***
I(loan_amnt^2)              6.243e-09  5.757e-11  108.437  < 2e-16 ***
open_acc                    9.380e-02  1.243e-03   75.472  < 2e-16 ***
pub_rec                     6.322e-01  8.142e-03   77.651  < 2e-16 ***
revol_bal                   1.396e-05  4.160e-07   33.558  < 2e-16 ***
revol_util                  4.140e-02  2.349e-04  176.253  < 2e-16 ***
total_acc                  -2.351e-02  5.536e-04  -42.471  < 2e-16 ***
sqrt(annual_inc)           -1.254e-02  1.363e-04  -92.009  < 2e-16 ***
initial_list_statusw       -1.021e+00  9.797e-03 -104.205  < 2e-16 ***
collections_12_mths_ex_med  6.208e-01  3.427e-02   18.117  < 2e-16 ***
application_typeJOINT       1.370e-01  2.224e-01    0.616 0.537766    
acc_now_delinq              1.506e+00  5.773e-02   26.081  < 2e-16 ***
tot_coll_amt                1.418e-06  3.645e-07    3.891 9.97e-05 ***
tot_cur_bal                -8.314e-07  4.165e-08  -19.961  < 2e-16 ***
total_rev_hi_lim           -2.370e-05  2.716e-07  -87.272  < 2e-16 ***
emp_length_sim>= 5          3.469e-02  9.379e-03    3.699 0.000217 ***
gen_home_ownershipothers    4.190e-01  2.791e-01    1.502 0.133209    
gen_home_ownershipOWN       4.489e-01  1.635e-02   27.451  < 2e-16 ***
gen_home_ownershipRENT      5.841e-01  1.124e-02   51.982  < 2e-16 ***
gen_add_state_zip_ave_rate  4.619e-01  1.168e-02   39.537  < 2e-16 ***
issue_d_date               -5.246e-04  1.087e-05  -48.271  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.377 on 573563 degrees of freedom
Multiple R-squared:  0.4069,	Adjusted R-squared:  0.4068 
F-statistic: 1.035e+04 on 38 and 573563 DF,  p-value: < 2.2e-16

Using glmnet to generate linear model

In [189]:
# split data into train and test for model performance
set.seed(1)
loan.complete = loan[complete.cases(loan), ]

selected_features = c("loan_amnt", "annual_inc", "dti", "delinq_2yrs", "open_acc",
                      "pub_rec", "revol_bal", "revol_util", "total_acc", 
                      "tot_coll_amt", "tot_cur_bal", "total_rev_hi_lim", "gen_annual_inc_sqrt", 
                      "gen_add_state_zip_ave_rate", "int_rate", "gen_loan_amnt_square")
# generate nonlinear features
loan.complete$gen_loan_amnt_square = loan.complete$loan_amnt^2
loan.complete$gen_annual_inc_sqrt = sqrt(loan.complete$annual_inc)

# generate dummy variables
dummy.variable = model.matrix(~purpose + term + gen_home_ownership + 
                              collections_12_mths_ex_med + 
                              acc_now_delinq + emp_length_sim - 1, loan.complete)

# numerical variable
num.variable = as.matrix(loan.complete[, selected_features])

# combine dummy with loan data
loan.sim = cbind(num.variable, dummy.variable)

train.ind <- sample(1:dim(loan.complete)[1], 0.7 * dim(loan.complete)[1])

train <- loan.sim[train.ind, ]
test <- loan.sim[-train.ind, ]
In [190]:
library(glmnet)
In [191]:
which(colnames(train)=='int_rate')
15
In [192]:
train.feature = train[, -15]
train.label = train[, 15]
test.feature = test[, -15]
test.label = test[, 15]

save(train.feature, file = "train_feature.Rdata")
save(train.label, file = "train_label.Rdata")
save(test.feature, file = "test_feature.Rdata")
save(test.label, file = "test_label.Rdata")
In [193]:
colnames(train.feature)
  1. 'loan_amnt'
  2. 'annual_inc'
  3. 'dti'
  4. 'delinq_2yrs'
  5. 'open_acc'
  6. 'pub_rec'
  7. 'revol_bal'
  8. 'revol_util'
  9. 'total_acc'
  10. 'tot_coll_amt'
  11. 'tot_cur_bal'
  12. 'total_rev_hi_lim'
  13. 'gen_annual_inc_sqrt'
  14. 'gen_add_state_zip_ave_rate'
  15. 'gen_loan_amnt_square'
  16. 'purposecar'
  17. 'purposecredit_card'
  18. 'purposedebt_consolidation'
  19. 'purposeeducational'
  20. 'purposehome_improvement'
  21. 'purposehouse'
  22. 'purposemajor_purchase'
  23. 'purposemedical'
  24. 'purposemoving'
  25. 'purposeother'
  26. 'purposerenewable_energy'
  27. 'purposesmall_business'
  28. 'purposevacation'
  29. 'purposewedding'
  30. 'term 60 months'
  31. 'gen_home_ownershipothers'
  32. 'gen_home_ownershipOWN'
  33. 'gen_home_ownershipRENT'
  34. 'collections_12_mths_ex_med'
  35. 'acc_now_delinq'
  36. 'emp_length_sim>= 5'
In [194]:
mod.glmnet <- glmnet(x=train.feature, y=train.label)
In [195]:
cvfit <- cv.glmnet(train.feature, train.label)
plot(cvfit)
In [177]:
# predict result with minimum mean cross-validated error
glm.pred = predict(cvfit, newx = test.feature, s = "lambda.min")
In [178]:
# cross validation result : root mean square error

mean((glm.pred - test.label)^2)^0.5
3.44734493595012